In [1]:
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
import numpy as np
import scipy as sy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.units as munits
import matplotlib as mpl
from scipy.special import ndtri
import warnings
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
In [2]:
df = pd.read_csv('Time series_modified.csv', header=2, sep=',') #read timeseries data from Inagro agrodig
df
Out[2]:
Hour Day Vedows_grass combo Actual_grass PM Total Biogas
0 0 26/03/2020 57.0 21.09 NaN 57 2.89
1 1 26/03/2020 39.0 14.43 NaN 39 2.89
2 2 26/03/2020 58.0 21.46 NaN 58 2.90
3 3 26/03/2020 67.0 24.79 NaN 67 2.93
4 4 26/03/2020 39.0 14.43 NaN 39 2.96
... ... ... ... ... ... ... ...
979 19 5/05/2020 NaN 0.00 125.0 125 7.55
980 20 5/05/2020 NaN 0.00 NaN 0 7.55
981 21 5/05/2020 NaN 0.00 125.0 125 3.86
982 22 5/05/2020 NaN 0.00 NaN 0 2.68
983 23 5/05/2020 NaN 0.00 125.0 125 7.15

984 rows × 7 columns

In [3]:
#creating a dataframe with timestamp
t_index = pd.DatetimeIndex(pd.date_range(start='2020-03-26', end='2020-05-05 23:00:00', freq="1h"))
time = pd.DataFrame(t_index, columns=['hour'])
time
Out[3]:
hour
0 2020-03-26 00:00:00
1 2020-03-26 01:00:00
2 2020-03-26 02:00:00
3 2020-03-26 03:00:00
4 2020-03-26 04:00:00
... ...
979 2020-05-05 19:00:00
980 2020-05-05 20:00:00
981 2020-05-05 21:00:00
982 2020-05-05 22:00:00
983 2020-05-05 23:00:00

984 rows × 1 columns

In [4]:
extracted_col = df[df.columns[2:9]]
extracted_col
Out[4]:
Vedows_grass combo Actual_grass PM Total Biogas
0 57.0 21.09 NaN 57 2.89
1 39.0 14.43 NaN 39 2.89
2 58.0 21.46 NaN 58 2.90
3 67.0 24.79 NaN 67 2.93
4 39.0 14.43 NaN 39 2.96
... ... ... ... ... ...
979 NaN 0.00 125.0 125 7.55
980 NaN 0.00 NaN 0 7.55
981 NaN 0.00 125.0 125 3.86
982 NaN 0.00 NaN 0 2.68
983 NaN 0.00 125.0 125 7.15

984 rows × 5 columns

In [5]:
df1 = time.join(extracted_col)
df1.columns
Out[5]:
Index(['hour', 'Vedows_grass combo', 'Actual_grass', 'PM', 'Total', 'Biogas'], dtype='object')
In [6]:
df1['hour'] = pd.to_datetime(time['hour'])
df1
Out[6]:
hour Vedows_grass combo Actual_grass PM Total Biogas
0 2020-03-26 00:00:00 57.0 21.09 NaN 57 2.89
1 2020-03-26 01:00:00 39.0 14.43 NaN 39 2.89
2 2020-03-26 02:00:00 58.0 21.46 NaN 58 2.90
3 2020-03-26 03:00:00 67.0 24.79 NaN 67 2.93
4 2020-03-26 04:00:00 39.0 14.43 NaN 39 2.96
... ... ... ... ... ... ...
979 2020-05-05 19:00:00 NaN 0.00 125.0 125 7.55
980 2020-05-05 20:00:00 NaN 0.00 NaN 0 7.55
981 2020-05-05 21:00:00 NaN 0.00 125.0 125 3.86
982 2020-05-05 22:00:00 NaN 0.00 NaN 0 2.68
983 2020-05-05 23:00:00 NaN 0.00 125.0 125 7.15

984 rows × 6 columns

In [7]:
daily = df1.resample('D', on='hour').sum()
daily.describe()
#daily.to_csv('TS_inagro_disagg.csv', sep=',')
Out[7]:
Vedows_grass combo Actual_grass PM Total Biogas
count 41.000000 41.000000 41.000000 41.000000 41.000000
mean 895.365854 331.285366 1037.560976 1932.926829 105.335610
std 690.628509 255.532548 759.257139 1108.583655 31.822892
min 0.000000 0.000000 0.000000 0.000000 58.940000
25% 0.000000 0.000000 575.000000 1365.000000 72.050000
50% 1125.000000 416.250000 1315.000000 2376.000000 115.330000
75% 1464.000000 541.680000 1380.000000 2628.000000 133.020000
max 1925.000000 712.250000 3920.000000 5151.000000 156.860000
In [8]:
df2 = daily.reset_index()
df2.rename({'hour': 'Day'}, axis=1, inplace=True)
df2.describe()
Out[8]:
Vedows_grass combo Actual_grass PM Total Biogas
count 41.000000 41.000000 41.000000 41.000000 41.000000
mean 895.365854 331.285366 1037.560976 1932.926829 105.335610
std 690.628509 255.532548 759.257139 1108.583655 31.822892
min 0.000000 0.000000 0.000000 0.000000 58.940000
25% 0.000000 0.000000 575.000000 1365.000000 72.050000
50% 1125.000000 416.250000 1315.000000 2376.000000 115.330000
75% 1464.000000 541.680000 1380.000000 2628.000000 133.020000
max 1925.000000 712.250000 3920.000000 5151.000000 156.860000
In [9]:
df_cumulative = daily[daily.columns[0:9]].cumsum()
#df_cumulative.to_csv('TS_inagro.csv', sep=',')
df_cumulative.columns
Out[9]:
Index(['Vedows_grass combo', 'Actual_grass', 'PM', 'Total', 'Biogas'], dtype='object')
In [10]:
df_cumulative.describe()
Out[10]:
Vedows_grass combo Actual_grass PM Total Biogas
count 41.000000 41.000000 41.000000 41.000000 41.000000
mean 13404.853659 4959.795854 20138.170732 33543.024390 1884.378293
std 12647.412895 4679.542771 13798.429927 26091.855225 1301.674017
min 1427.000000 527.990000 0.000000 1427.000000 70.240000
25% 1486.000000 549.820000 7860.000000 9346.000000 772.290000
50% 9220.000000 3411.400000 20430.000000 29650.000000 1642.310000
75% 24338.000000 9005.060000 33990.000000 58328.000000 2953.240000
max 36710.000000 13582.700000 42540.000000 79250.000000 4318.760000
In [11]:
df2.columns
df2.describe()
Out[11]:
Vedows_grass combo Actual_grass PM Total Biogas
count 41.000000 41.000000 41.000000 41.000000 41.000000
mean 895.365854 331.285366 1037.560976 1932.926829 105.335610
std 690.628509 255.532548 759.257139 1108.583655 31.822892
min 0.000000 0.000000 0.000000 0.000000 58.940000
25% 0.000000 0.000000 575.000000 1365.000000 72.050000
50% 1125.000000 416.250000 1315.000000 2376.000000 115.330000
75% 1464.000000 541.680000 1380.000000 2628.000000 133.020000
max 1925.000000 712.250000 3920.000000 5151.000000 156.860000
In [12]:
df2
Out[12]:
Day Vedows_grass combo Actual_grass PM Total Biogas
0 2020-03-26 1427.0 527.99 0.0 1427 70.24
1 2020-03-27 59.0 21.83 0.0 59 71.27
2 2020-03-28 0.0 0.00 0.0 0 72.05
3 2020-03-29 0.0 0.00 960.0 960 73.79
4 2020-03-30 0.0 0.00 805.0 805 73.41
5 2020-03-31 0.0 0.00 0.0 0 73.07
6 2020-04-01 0.0 0.00 575.0 575 72.18
7 2020-04-02 0.0 0.00 1380.0 1380 70.03
8 2020-04-03 0.0 0.00 1380.0 1380 68.23
9 2020-04-04 0.0 0.00 1380.0 1380 65.45
10 2020-04-05 0.0 0.00 1380.0 1380 62.57
11 2020-04-06 0.0 0.00 1380.0 1380 61.62
12 2020-04-07 0.0 0.00 720.0 720 60.76
13 2020-04-08 0.0 0.00 0.0 0 58.94
14 2020-04-09 0.0 0.00 640.0 640 59.72
15 2020-04-10 951.0 351.87 1610.0 2561 83.45
16 2020-04-11 1231.0 455.47 3920.0 5151 101.71
17 2020-04-12 1026.0 379.62 1380.0 2406 108.19
18 2020-04-13 1302.0 481.74 1380.0 2682 104.55
19 2020-04-14 1698.0 628.26 690.0 2388 115.33
20 2020-04-15 1526.0 564.62 850.0 2376 115.75
21 2020-04-16 1435.0 530.95 1380.0 2815 119.62
22 2020-04-17 1715.0 634.55 1380.0 3095 127.92
23 2020-04-18 1578.0 583.86 1610.0 3188 133.00
24 2020-04-19 1103.0 408.11 1380.0 2483 127.37
25 2020-04-20 1925.0 712.25 1610.0 3535 140.70
26 2020-04-21 1289.0 476.93 1610.0 2899 143.93
27 2020-04-22 1239.0 458.43 1275.0 2514 137.03
28 2020-04-23 1586.0 586.82 900.0 2486 133.23
29 2020-04-24 1775.0 656.75 1315.0 3090 135.36
30 2020-04-25 1473.0 545.01 1100.0 2573 112.77
31 2020-04-26 1785.0 660.45 435.0 2220 156.86
32 2020-04-27 1532.0 566.84 0.0 1532 133.02
33 2020-04-28 1464.0 541.68 0.0 1464 129.20
34 2020-04-29 1365.0 505.05 0.0 1365 132.76
35 2020-04-30 1125.0 416.25 125.0 1250 127.42
36 2020-05-01 1161.0 429.57 1500.0 2661 129.51
37 2020-05-02 964.0 356.68 1625.0 2589 136.15
38 2020-05-03 878.0 324.86 1750.0 2628 134.91
39 2020-05-04 1169.0 432.53 1500.0 2669 136.85
40 2020-05-05 929.0 343.73 1615.0 2544 148.84
In [13]:
fig = px.line(df2, x=df2.Day, y=df2.Biogas, title="Daily biogas production")
fig.show()
In [14]:
fig1 = px.scatter(df2, x='Day', y='Total', color='Biogas', size='Actual_grass')
fig1.show()
In [15]:
fig = px.scatter(df2, x='Day', y='Biogas', color='Total', size='Actual_grass')
fig.show()
In [16]:
fig2 = px.line(df2, x='Day', y='Total')
fig2.show()
In [17]:
data = df2.drop(['Day', 'Vedows_grass combo', 'Actual_grass', 'PM', 'Total'], axis=1)
data
Out[17]:
Biogas
0 70.24
1 71.27
2 72.05
3 73.79
4 73.41
5 73.07
6 72.18
7 70.03
8 68.23
9 65.45
10 62.57
11 61.62
12 60.76
13 58.94
14 59.72
15 83.45
16 101.71
17 108.19
18 104.55
19 115.33
20 115.75
21 119.62
22 127.92
23 133.00
24 127.37
25 140.70
26 143.93
27 137.03
28 133.23
29 135.36
30 112.77
31 156.86
32 133.02
33 129.20
34 132.76
35 127.42
36 129.51
37 136.15
38 134.91
39 136.85
40 148.84
In [18]:
# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Distributions to check
    DISTRIBUTIONS = [        
        st.dgamma,st.dweibull,st.expon,st.exponnorm,st.gamma,st.lognorm,st.norm,st.t,st.triang,
        st.uniform
    ]

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            pass

    return (best_distribution.name, best_params)

def make_pdf(dist, params, size=10000):
    """Generate distributions's Probability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf
In [19]:
# Plot for comparison
plt.figure(figsize=(20,15))
ax = data.plot(kind='hist', bins=50, density=True, alpha=0.5, color = [color['color'] for color in list(mpl.rcParams['axes.prop_cycle'])])
# Save plot limits
dataYLim = ax.get_ylim()

# Find best fit distribution
best_fit_name, best_fit_params = best_fit_distribution(data, 200, ax)
best_dist = getattr(st, best_fit_name)

# Update plots
ax.set_ylim(dataYLim)
ax.set_title(u'Biogas yield.\n Vanheede ')
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')

# Make PDF with best params 
pdf = make_pdf(best_dist, best_fit_params)

# Display
plt.figure(figsize=(20,15))
ax = pdf.plot(lw=2, label='PDF', legend=True)
data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)

param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fit_params)])
dist_str = '{}({})'.format(best_fit_name, param_str)

ax.set_title(u'Biogas yield. with best fit distribution \n' + dist_str)
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
Out[19]:
Text(0, 0.5, 'Frequency')
<Figure size 1440x1080 with 0 Axes>
In [ ]: